home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Collection of Internet
/
Collection of Internet.iso
/
infosrvr
/
dev
/
libhtml_.tar
/
SGML.c
< prev
next >
Wrap
C/C++ Source or Header
|
1993-01-20
|
12KB
|
534 lines
/* SGML_stream.c
* $Id: SGMLstream.c,v 1.3 93/01/06 18:40:28 connolly Exp Locker: connolly $
*/
/* implements... */
#include "SGML.h"
/* uses ... */
#include "object.h"
#include <ctype.h>
#include <assert.h>
#include <string.h>
/*
 * SGML_parseInstance -- parse a document instance and report it to a
 * document class.
 *
 * Repeatedly calls SGML_read() on `stream` (characters are fetched with
 * `getch`) and dispatches on what it returns:
 *
 *   start tag   -> (docclass->startTag)(document, gi, attributes, attrqty)
 *   end tag     -> (docclass->endTag)(document, gi)
 *   entity ref  -> (docclass->entityText)(document, name), and if that
 *                  returns non-null text, (docclass->data) with the text
 *   data        -> (docclass->data)(document, chars, length)
 *
 * The value returned by startTag is either SGML_EMPTY or the content
 * type passed to subsequent SGML_read() calls.
 *
 * Record ends ('\n') are not passed through blindly:
 *   - the first record end of the instance, and the first one after a
 *     start tag that did not return SGML_EMPTY, is dropped (eat_next_RE);
 *   - any other record end is held back (RE_pending) and only delivered
 *     once data, an entity reference, a start tag or another record end
 *     follows; a record end directly before an end tag is dropped.
 *
 * Attribute names and values are copied to storage obtained with NEW()
 * for the duration of the startTag call and released with FREE()
 * afterwards.  At most SGML_ATTCNT attributes are passed on; any
 * further attributes of the same tag are read and discarded.
 */
VOID
SGML_parseInstance(stream, getch, document, docclass)
     HMStream stream;
     HMGetcProc *getch;
     HMDoc* document;
     CONST HMDoc_Class *docclass;
{
  static char RE[] = "\n";

  /* REbuffer[0] holds a record end so that a pending RE and the data
   * that follows it can be delivered with a single data() call. */
  char REbuffer[1 + SGML_LITLEN + SGML_NAMELEN + 4];
  char *buffer = REbuffer + 1;

  int content = SGML_MIXED;
  int lookahead = EOF;
  int len, read;

  char gi[SGML_NAMELEN+1];
  HMBinding attributes[SGML_ATTCNT];
  int attrqty;

  char eat_next_RE = 1, RE_pending = 0;

  REbuffer[0] = '\n'; /*@@ should be 13, not 10! */

  while( (read = SGML_read(stream, getch, buffer, sizeof(REbuffer) - 2,
			   content, &lookahead)) != EOF){
    switch(read){

    case SGML_start_tag:
      if (RE_pending){
	(docclass->data)(document, RE, 1);
      }

      len = SGML_read_name(stream, getch, gi, &lookahead);
      /* keep the terminator inside gi whatever length was reported */
      if(len > SGML_NAMELEN) len = SGML_NAMELEN;
      gi[len] = 0;

      attrqty = 0;
      while(isalpha(lookahead)){ /* iterate over attributes */
	len = SGML_read_name(stream, getch,
			     buffer, &lookahead);
	buffer[len] = 0;

	if(lookahead == '='){
	  /* buffer layout: name '\0' value '\0' */
	  int offset = len + 1;
	  int vlen;

	  lookahead = EOF;
	  /* @@ entity references in attribute value */
	  vlen = SGML_read_value(stream,
				 getch,
				 buffer + offset,
				 &lookahead);
	  /* EOF before any value: treat it as an empty value so that
	   * attr->value below still points inside the copied block */
	  if(vlen == EOF) vlen = 0;

	  len = offset + vlen;
	  buffer[len++] = '\0';

	  if(attrqty < SGML_ATTCNT){
	    HMBinding* attr = &attributes[attrqty++];

	    attr->name = NEW(char, len);
	    memcpy(attr->name, buffer, len);
	    attr->value = attr->name + offset;
	  }
	  /* else error: too many attributes; the excess is dropped */
	}
      }

      /* look for tag close */
      while(isspace(lookahead))
	lookahead = (getch)(stream);
      lookahead = EOF; /* discard the tag close character */

      {
	int i;
	int c;

	c = (docclass->startTag)(document, gi, attributes, attrqty);
	if(c == SGML_EMPTY){
	  eat_next_RE = 0;
	}else{
	  content = c;
	  eat_next_RE = 1;
	}

	for(i=0; i<attrqty; i++)
	  FREE(attributes[i].name);
      }
      RE_pending = 0;
      break;

    case SGML_end_tag:
      /* drop pending RE */
      len = SGML_read_name(stream, getch, gi, &lookahead);
      /* keep the terminator inside gi whatever length was reported */
      if(len > SGML_NAMELEN) len = SGML_NAMELEN;
      gi[len] = 0;

      /* look for tag close */
      while(isspace(lookahead))
	lookahead = (getch)(stream);
      lookahead = EOF; /* discard the tag close character */

      (docclass->endTag)(document, gi);
      content = SGML_MIXED; /* @@ could be element */
      eat_next_RE = 0;
      RE_pending = 0;
      break;

    case SGML_entity:
      if (RE_pending){
	(docclass->data)(document, RE, 1);
      }
      eat_next_RE = 0;
      RE_pending = 0;

      {
	/* SGML_read left the NUL-terminated entity name in buffer */
	CONST char* text = (docclass->entityText)(document, buffer);
	if(text)
	  (docclass->data)(document, text, strlen(text));
      }
      break;

    case SGML_record_end:
      if(eat_next_RE){
	eat_next_RE = 0;
	RE_pending = 0;
      }
      else if (RE_pending){
	/* second RE in a row: deliver the first, keep this one pending */
	(docclass->data)(document, RE, 1);
      }
      else
	RE_pending = 1;
      break;

    default:
      /* `read` characters of data are in buffer */
      buffer[read] = 0;
      if(RE_pending)
	(docclass->data)(document, REbuffer, read + 1);
      else
	(docclass->data)(document, buffer, read);
      RE_pending = 0;
      eat_next_RE = 0;
      break;
    }
  }
}
/*****
* lexical analysis
*****/
/*
 * SGML_read -- lexical analyser for document content.
 *
 * Consumes characters from `stream` through `getch`, starting with
 * *inout_lookahead if that is not EOF, and recognises one item:
 *
 *   EOF              end of input with nothing read
 *   SGML_record_end  a '\n' was read (and consumed)
 *   SGML_start_tag   "<" + letter seen; the letter is left in
 *   SGML_end_tag     "</" + letter seen;  *inout_lookahead for the caller
 *   SGML_entity      "&name" seen; the name is stored in buf followed
 *                    by a '\0'
 *   otherwise        the number of data characters stored in buf
 *                    (not NUL-terminated by this function)
 *
 * `nbytes` bounds the number of characters stored in buf.  `content` is
 * one of SGML_MIXED, SGML_ELEMENT, SGML_CDATA or SGML_RCDATA and selects
 * which delimiters are recognised.  On return *inout_lookahead holds the
 * character that ended the item and still has to be processed, or EOF
 * if there is none.
 *
 * Processing instructions, markup declarations, marked sections and
 * comments are skipped without being reported.
 *
 * NOTE(review): in SGML_ELEMENT content a non-space data character makes
 * this function return 0 with that same character left in
 * *inout_lookahead; a caller that calls again without changing `content`
 * appears to get the same result again -- confirm callers cope with this.
 */
int
SGML_read(stream, getch,
buf, nbytes,
content,
inout_lookahead)
HMStream stream;
HMGetcProc* getch;
char* buf;
int nbytes;
int content;
int* inout_lookahead;
{
int c; /* state machine input character */
enum { /* state machine states */
start, data, cdata, rcdata, pcdata,
and, and_hash, cref, entity,
lt, lt_slash, tag,
pi,
lt_bang, lt_bang_dash,
comment, comment_dash, ps
} state = start;
/* auxiliary state: */
int end_tag; /* saw '/' after '<' */
char name[SGML_NAMELEN + 1]; /* function character name */
int name_chars;
int ret = 0; /* number of characters read */
/* LOOKAHEAD(n): true while ret + n is still below nbytes */
#define LOOKAHEAD(n) (ret + n < nbytes)
/* REDUCE(s): enter state s and leave the switch, so the current
 * character is consumed and the next one is fetched */
#define REDUCE(s) { state = (s); break; }
/* SHIFT(s): enter state s and restart the loop, so the current
 * character is examined again in the new state */
#define SHIFT(s) { state = (s); continue; }
/* DONE(c): hand c back as lookahead and return ret */
#define DONE(c) { *inout_lookahead = (c); return ret; }
/* WRITE(c): append c to buf and count it */
#define WRITE(c) { *buf++ = (c); ret++; }
/* prime the pump */
if((c = *inout_lookahead) == EOF)
c = (getch)(stream);
/* state machine...*/
while(ret < nbytes){
switch(state){
case start:
if(c == EOF) return EOF;
else if(c == '\n') { ret = SGML_record_end; DONE(EOF); }
else if(c == '<'){
if(LOOKAHEAD(3)) { REDUCE(lt); }
else { DONE(c); } /* no room for lookahead */
}else if(c == '&'){
if(LOOKAHEAD(2)) { REDUCE(and); }
}else if(content == SGML_ELEMENT && isspace(c)){
break; /* ignore whitespace in ELEMENT content */
}else { SHIFT(data); }
/* reached only for '&' without room for lookahead: falls into data */
case data:
if(content == SGML_ELEMENT){
if(isspace(c)){
break;
}else{
/* data character in element content: report zero characters and
 * hand the character back */
*buf = 0; ret = 0; DONE(c);
}
}else if(content == SGML_CDATA){ SHIFT(cdata); }
else if(content == SGML_RCDATA){ SHIFT(rcdata); }
else /* assume SGML_MIXED */ { SHIFT(pcdata); }
case cdata:
/* '&' is ordinary data here; only EOF, '<' and '\n' end the run */
if(c == EOF || c == '<' || c == '\n') { DONE(c); }
else{ WRITE(c); break; }
case rcdata:
case pcdata:
if(c == EOF || c == '<' || c == '&' || c == '\n') { DONE(c); }
else{ WRITE(c); break; }
case and:
/* character after '&' */
if(c == '#') { REDUCE(and_hash); }
else if(isalpha(c)) {
if(LOOKAHEAD(SGML_NAMELEN+1)){
name_chars = 0; SHIFT(entity);
}else{
DONE(c); /* error: no room for entity name */
}
}
else{ WRITE('&'); SHIFT(data); }
case entity:
/* the entity name is collected directly in buf */
if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
WRITE(c);
break;
}
else{
WRITE('\0');
ret = SGML_entity;
if(c == ';' || c == '\n'){ DONE(EOF); /* eat ; */ }
else{ DONE(c); /* ended char ref with other char */ }
}
case and_hash:
/* character after "&#" */
if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
else{ WRITE('&'); WRITE('#'); SHIFT(data); }
case cref:
/* auxiliary state: name_chars */
if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
if(name_chars < SGML_NAMELEN)
name[name_chars++] = c;
/* else markup error: name too long */
break;
}
else{
/* end of reference: a number, or one of SPACE, RS, RE */
int nc = 0;
name[name_chars] = '\0';
if(isdigit(name[0])){
nc = atoi(name);
}else if(!strcmp(name, "SPACE")){
nc = 32;
}else if(!strcmp(name, "RS")){
nc = 10;
}else if(!strcmp(name, "RE")){
nc = 13;
}
if(nc) WRITE(nc); /* else error: bad character reference */
if(c == ';') { REDUCE(data); }
else
/* terminate entity reference w/space or something */
{ SHIFT(data); }
}
case lt:
/* character after '<' */
if(c == '/') { REDUCE(lt_slash); }
if(content == SGML_MIXED || content == SGML_ELEMENT){
if(c == '?') { REDUCE(pi); }
else if(c == '!') { REDUCE(lt_bang); }
else if(isalpha(c)) { end_tag = 0; SHIFT(tag); }
}
/* not markup in this content type: the '<' is data */
WRITE('<'); SHIFT(data);
case lt_slash:
if(isalpha(c)) { end_tag = 1; SHIFT(tag); }
else { WRITE('<'); WRITE('/'); SHIFT(data); }
case tag:
/* auxiliary state: end_tag */
/* c is the first letter of the tag name; the caller reads the name */
ret = end_tag ? SGML_end_tag : SGML_start_tag;
DONE(c);
case pi: /* processing instruction (or markup declaration) */
/* skip everything up to and including '>' */
if(c == '>') { REDUCE(start); }
else if(c == EOF) { SHIFT(start); } /* error: EOF in pic */
else break;
case lt_bang:
if(c == '-') { REDUCE(lt_bang_dash); }
/*
 * *** NON CONFORMING IMPLEMENTATION ***
 * a letter here starts a markup declaration, which isn't supported
 * a [ starts a marked section, which isn't supported.
 * treat them like processing instructions.
 */
else if(c == '[' || isalpha(c)) { REDUCE(pi); }
else if(c == '>') { REDUCE(start); }
else{ WRITE('<'); WRITE('!'); SHIFT(data); }
case lt_bang_dash:
if(c == '-') { REDUCE(comment); }
else{ WRITE('<'); WRITE('!'); WRITE('-'); SHIFT(data); }
case comment:
/* inside a comment: look for the first '-' of the closing "--" */
if(c == '-') { REDUCE(comment_dash); }
else if(c == EOF) { DONE(c); } /* error: eof in comment */
else break;
case comment_dash:
if(c == '-') { REDUCE(ps); }
else if(c == EOF) { DONE(c); }/* error: eof in comment */
else { REDUCE(comment); }
case ps: /* parameter separator between -- and > */
if(c == EOF) { DONE(c); }
else if(isspace(c)) break;
else { REDUCE(start); }/* error if c !='>' */
}
/* the current character has been consumed: fetch the next one */
c = (getch)(stream);
}
/* buf is full: the character just fetched becomes the lookahead */
DONE(c); /* set up lookahead for next call */
/* S is not defined anywhere in this function */
#undef S
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}
/*
 * SGML_read_name -- read a name (generic identifier or attribute name).
 *
 * The first character of the name must already be in *inout_lookahead;
 * if it is not a letter nothing is read and 0 is returned.  Otherwise
 * characters are taken from `stream` through `getch` for as long as
 * they are alphanumeric or appear in SGML_UCNMCHAR.  Each character is
 * folded to upper case and stored in `buf`.
 *
 * At most SGML_NAMELEN characters are stored; the rest of an over-long
 * name is read and discarded.  No terminating '\0' is written, so a
 * caller with a buffer of SGML_NAMELEN + 1 bytes can safely store one
 * at buf[return value].
 *
 * Whitespace after the name is skipped, and the first character that
 * follows is left in *inout_lookahead.
 *
 * Returns the number of characters stored in buf.
 */
int
SGML_read_name(stream, getch, buf, inout_lookahead)
     HMStream stream;
     HMGetcProc* getch;
     char* buf;
     int* inout_lookahead;
{
  int name_chars = 0;
  int c = *inout_lookahead;

  if(!isalpha(c)) return 0;

  do{
    /* strictly less than: leave room for the caller's terminator */
    if(name_chars < SGML_NAMELEN)
      buf[name_chars++] = toupper(c);
    /* else error: name too long */

    c = (getch)(stream);
  }while(isalnum(c) || strchr(SGML_UCNMCHAR, c));

  /* skip the separator between the name and whatever follows */
  while(isspace(c))
    c = (getch)(stream);

  *inout_lookahead = c;
  return name_chars;
}
/*
 * SGML_read_value -- read one attribute value.
 *
 * Consumes characters from `stream` through `getch`, starting with
 * *inout_lookahead if that is not EOF.  Leading whitespace is skipped.
 * A value delimited by " or ' is read up to the matching quote; inside
 * it '\n' and '\t' are stored as a space, and character references
 * ("&#" followed by a number or by SPACE, RS or RE) are replaced by the
 * character they denote.  A reference with an unrecognised name stores
 * nothing.  Unquoted values are accepted only when
 * GROK_UNQUOTED_LITERALS or SGML_SHORTTAG is defined.
 *
 * The value is stored in `buf` without a terminating '\0'.  Reading
 * stops once SGML_LITLEN characters have been stored.
 *
 * Returns the number of characters stored, or EOF if the input ended
 * before a value started.  On return *inout_lookahead holds the first
 * non-space character after the value, the offending character if the
 * value could not be started, or EOF if the value was too long.
 */
int
SGML_read_value (stream,
		 getch,
		 buf,
		 inout_lookahead)
     HMStream stream;
     HMGetcProc* getch;
     char* buf;
     int* inout_lookahead;
{
  int c; /* state machine input character */
  enum { /* state machine states */
    start,
    literal,
    and, and_hash, cref,
#if defined(SGML_SHORTTAG) || defined(GROK_UNQUOTED_LITERALS)
    value,
#endif
    ps
  } state = start;

  /* auxiliary state: */
  char quote; /* which kind of quote */
  int ret = 0; /* number of characters read */
  char name[SGML_NAMELEN + 1]; /* entity name */
  int name_chars;

/* LOOKAHEAD(n): true while ret + n is still below SGML_LITLEN */
#define LOOKAHEAD(n) (ret + n < SGML_LITLEN)
/* REDUCE(s): enter state s and consume the current character */
#define REDUCE(s) { state = (s); break; }
/* SHIFT(s): enter state s and look at the current character again */
#define SHIFT(s) { state = (s); continue; }
/* DONE(c): hand c back as lookahead and return ret */
#define DONE(c) { *inout_lookahead = (c); return ret; }
/* WRITE(c): append c to buf and count it */
#define WRITE(c) { *buf++ = (c); ret++; }

  /* prime the pump */
  if((c = *inout_lookahead) == EOF)
    c = (getch)(stream);

  /* state machine...*/
  while(ret < SGML_LITLEN){
    switch(state){
    case start:
      if(c == EOF) return EOF;
      else if(c == '"') { quote = c; REDUCE(literal); }
      else if(c == '\'') { quote = c; REDUCE(literal); }
      else if(isspace(c)) break;
#ifdef GROK_UNQUOTED_LITERALS
      else if(!(c == '>')){
	SHIFT(value);
      }
#else
#ifdef SGML_SHORTTAG
      else if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
	SHIFT(value);
      }
#else
      else { DONE(c); } /* error: illegal char in markup */
#endif
#endif

#ifdef GROK_UNQUOTED_LITERALS
    case value:
      if(c == EOF) { DONE(c); }
      else if(isspace(c) || c == '>'){ SHIFT(ps); }
      else{
	WRITE(c);
	break;
      }
#else
#ifdef SGML_SHORTTAG
    case value:
      if(c == EOF) { DONE(c); }
      else if(isalnum(c) || strchr(SGML_UCNMCHAR, c)){
	WRITE(c);
	break;
      }else{ SHIFT(ps); }
#endif
#endif

    case literal:
      if(c == EOF) { DONE(c); }
      else if(c == quote) { REDUCE(ps); }
      else if(c == '&'){ REDUCE(and); }
      else if(c == '\n' || c == '\t'){ WRITE(' '); break; }
      else{
	WRITE(c);
	break;
      }

    case and:
      /* character after '&' */
      if(c == '#') { REDUCE(and_hash); }
      /*@@ else if(isalpha(c)) ... process entity reference */
      else{ WRITE('&'); SHIFT(literal); }

    case and_hash:
      /* character after "&#" */
      if(isalnum(c)){ name_chars = 0; SHIFT(cref); }
      else{ WRITE('&'); WRITE('#'); SHIFT(literal); }

    case cref:
      /*@@ in case of &#xyz;, this throws out &#xyz as error, when
	it should only throw out &#x */
      if(isdigit(c) || isalpha(c)
	 || strchr(SGML_UCNMCHAR, c)){
	if(name_chars < SGML_NAMELEN)
	  name[name_chars++] = c;
	/* else markup error: name too long */
	break;
      }
      else{
	int nc = 0;

	name[name_chars] = '\0';
	if(isdigit(name[0])){
	  nc = atoi(name);
	}else if(!strcmp(name, "SPACE")){
	  nc = 32;
	}else if(!strcmp(name, "RS")){
	  nc = 10;
	}else if(!strcmp(name, "RE")){
	  nc = 13;
	}
	/* An unrecognised name leaves nc == 0 and writes nothing, but
	 * must still return to the literal state.  Staying in cref
	 * would swallow the terminator (even the closing quote) and
	 * never make progress once the input reaches EOF. */

	if(nc) WRITE(nc); /* else error: bad character reference */

	if(c == ';') { REDUCE(literal); }
	else
	  /* terminate entity reference w/space or something */
	  { SHIFT(literal); }
      }

    case ps: /* parameter separator between attributes */
      if(isspace(c)) break;
      else { DONE(c); }
    }

    /* the current character has been consumed: fetch the next one */
    c = (getch)(stream);
  }

  /* error: attribute value too long */
  DONE(EOF); /* set lookahead to EOF for next call */

#undef S
#undef LOOKAHEAD
#undef REDUCE
#undef SHIFT
#undef DONE
#undef WRITE
}